import re # 不用安装（注意！！）
import os # 文件夹等的操作（注意！！）
import time
import requests # http urllib2
from bs4 import BeautifulSoup

# First source page (presumably the Chinese Academy of Engineering member
# list, judging by the host name -- confirm).
url = 'http://www.cae.cn/cae/html/main/col48/column_48_1.html'
# A timeout keeps the script from hanging forever on an unresponsive server.
html = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of parsing an error page into an
# empty name list.
html.raise_for_status()
# Override the detected encoding; the page is assumed to be UTF-8.
html.encoding = 'utf-8'

soup = BeautifulSoup(html.text, "lxml")
# Every <a> that is a direct child of <li class="name_list">.
links = soup.select("li.name_list > a")
# De-duplicate with the same hashing/equality that set() would use, but keep
# first-seen page order so the output is deterministic between runs.
list1 = list(dict.fromkeys(links))
print(len(list1))

# Second source page (presumably the Chinese Academy of Sciences member list,
# judging by the host name -- confirm).
url2 = 'http://casad.cas.cn/ysxx2017/ysmdyjj/qtysmd_124280/'
# A timeout keeps the script from hanging forever on an unresponsive server.
html2 = requests.get(url2, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of parsing an error page.
html2.raise_for_status()
# Override the detected encoding; the page is assumed to be UTF-8.
html2.encoding = 'utf-8'

soup2 = BeautifulSoup(html2.text, "lxml")
# The names are looked up inside the element with id="allNameBar".
links2 = soup2.find(id="allNameBar")
if links2 is None:
    # find() returns None when nothing matches; report that explicitly rather
    # than failing with an opaque AttributeError on the next line.
    raise RuntimeError('no element with id="allNameBar" found at ' + url2)
links2_1 = links2.find_all("a", target="_blank")
# Count of matching links *before* de-duplication.
print(len(links2_1))
# De-duplicate with the same hashing/equality that set() would use, but keep
# first-seen page order so the output is deterministic between runs.
list2 = list(dict.fromkeys(links2_1))

# Save the collected names, one per line.
def _tag_text(tag):
    """Return the text of *tag* as a string, never None.

    ``tag.string`` is None when the tag does not have exactly one string
    child, so fall back to the concatenated text of the tag in that case.
    """
    text = tag.string
    if text is None:
        text = tag.get_text()
    return text


# The file must be opened as UTF-8 because the names are Chinese text.
# Plain "w" is enough: the file is only written, never read back.
with open("中国两院院士名单.txt", "w", encoding="utf-8") as f:
    for tag in list1:
        # Drop a trailing parenthetical by cutting at the first "(" instead
        # of assuming it is exactly three characters long. Text without "("
        # is written unchanged.
        f.write(_tag_text(tag).partition("(")[0] + '\n')
    for tag in list2:
        # Entries from the second list are written as-is.
        f.write(_tag_text(tag) + '\n')
   